import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')
# import plotly.express as px
# from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import datetime
import warnings
warnings.filterwarnings("ignore")
# from catboost import Pool, CatBoostRegressor
# import plotly.io as pio
# pio.renderers.default = 'colab'
import scipy as sp
import scipy.stats as stats
import lightgbm as lgb
import shap
# from statsmodels.regression.quantile_regression import QuantReg
# import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from bayes_opt import BayesianOptimization
from sklearn import base
from sklearn.model_selection import KFold,TimeSeriesSplit,RandomizedSearchCV,GridSearchCV,cross_val_score
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.metrics import classification_report,plot_confusion_matrix, average_precision_score, \
plot_precision_recall_curve,roc_auc_score,roc_curve, auc,log_loss,\
confusion_matrix,precision_recall_curve,PrecisionRecallDisplay,brier_score_loss
from sklearn.calibration import calibration_curve
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import ExtraTreesRegressor
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
from functools import partial
Project Overview
Build a ML model to predict if total delivery duration seconds is larger than 2400 (i.e., 40 mins), defined as the time from order creation by customers (created_at) to delivery (actual_delivery_time).
Detailed dictionary of the data is in the appendix.
Note
I changed the original dataset.
It's unclear whether precision or recall is more important, so I will use threshold-independent metrics (e.g., ROC-AUC and precision-recall curves) and report both precision and recall.
Also, we are interested in how well-calibrated the predicted probabilities are.
def convert_type(df, has_actual=False):
    """Cast raw columns to appropriate dtypes and derive calendar features.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data with at least 'created_at', 'market_id', 'store_id',
        'order_protocol' and 'store_primary_category' columns.
    has_actual : bool, default False
        If True, also parse 'actual_delivery_time' and derive the delivery
        'duration' (in seconds) and the binary target 'y' (duration >= 2400s).

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` sorted by 'created_at'; the input is not mutated.
    """
    df = df.copy()
    df['created_at'] = pd.to_datetime(df['created_at'], utc=True)
    df['created_at_pst'] = df['created_at'].dt.tz_convert("America/Los_Angeles")
    # categorical variables ('Int64' first so missing ids stay <NA> instead of float)
    for cat in ['market_id', 'store_id', 'order_protocol']:
        df[cat] = df[cat].astype('Int64').astype('category')
    df['store_primary_category'] = df['store_primary_category'].astype('category')
    df['created_day_of_week'] = (df['created_at'].dt.dayofweek + 1).astype('category')
    df['created_month'] = df['created_at'].dt.month.astype('category')
    df['created_hour'] = df['created_at'].dt.hour.astype('category')
    # Keep the PST hour numeric until after qcut: pd.qcut requires numeric
    # input, so binning must happen before the cast to 'category' (the
    # original order applied qcut to an already-categorical column).
    hour_pst = df['created_at_pst'].dt.hour
    df['created_hour_pst'] = hour_pst.astype('category')
    df['time_slot_pst'] = pd.qcut(hour_pst, 6).astype('category')
    if has_actual:
        df['actual_delivery_time'] = pd.to_datetime(df['actual_delivery_time'], utc=True)
        df['duration'] = (df['actual_delivery_time'] - df['created_at']).dt.total_seconds()
        # NOTE(review): rows with missing delivery time get duration NaN but
        # y == False here; those rows are dropped later in the notebook.
        df['y'] = df['duration'] >= 2400
    return df.sort_values('created_at')
# Load the historical data and apply the type conversions defined above.
df = convert_type(pd.read_csv('historical_data.csv'),has_actual=True)
# df_submit = convert_type(pd.read_csv('predict_data.csv'),has_actual=False)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 197428 entries, 2690 to 61787 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 market_id 196441 non-null category 1 created_at 197428 non-null datetime64[ns, UTC] 2 actual_delivery_time 197421 non-null datetime64[ns, UTC] 3 store_id 197428 non-null category 4 store_primary_category 192668 non-null category 5 order_protocol 196433 non-null category 6 total_items 197428 non-null int64 7 subtotal 197428 non-null int64 8 num_distinct_items 197428 non-null int64 9 min_item_price 197428 non-null int64 10 max_item_price 197428 non-null int64 11 total_onshift_dashers 181166 non-null float64 12 total_busy_dashers 181166 non-null float64 13 total_outstanding_orders 181166 non-null float64 14 estimated_order_place_duration 197428 non-null int64 15 estimated_store_to_consumer_driving_duration 196902 non-null float64 16 created_at_pst 197428 non-null datetime64[ns, America/Los_Angeles] 17 created_day_of_week 197428 non-null category 18 created_month 197428 non-null category 19 created_hour 197428 non-null category 20 created_hour_pst 197428 non-null category 21 time_slot_pst 197428 non-null category 22 duration 197421 non-null float64 23 y 197428 non-null bool dtypes: bool(1), category(9), datetime64[ns, America/Los_Angeles](1), datetime64[ns, UTC](2), float64(5), int64(6) memory usage: 25.0 MB
There is one item standing out, it is created on 2014-10-19, but the others are all in 2015! This must be a data error, so I will drop it.
## Historical averages
df['created_at'].sort_values()
2690 2014-10-19 05:24:15+00:00
43519 2015-01-21 15:22:03+00:00
148754 2015-01-21 15:31:51+00:00
187014 2015-01-21 15:39:16+00:00
10265 2015-01-21 15:40:42+00:00
...
176616 2015-02-18 05:57:51+00:00
100474 2015-02-18 05:58:07+00:00
191692 2015-02-18 05:59:01+00:00
168114 2015-02-18 05:59:23+00:00
61787 2015-02-18 06:00:44+00:00
Name: created_at, Length: 197428, dtype: datetime64[ns, UTC]
df.loc[2690,:]
market_id 1 created_at 2014-10-19 05:24:15+00:00 actual_delivery_time 2015-01-25 19:11:54+00:00 store_id 3560 store_primary_category italian order_protocol 1 total_items 1 subtotal 1695 num_distinct_items 1 min_item_price 1595 max_item_price 1595 total_onshift_dashers NaN total_busy_dashers NaN total_outstanding_orders NaN estimated_order_place_duration 446 estimated_store_to_consumer_driving_duration 412.0 created_at_pst 2014-10-18 22:24:15-07:00 created_day_of_week 7 created_month 10 created_hour 5 created_hour_pst 22 time_slot_pst (19.0, 23.0] duration 8516859.0 y True Name: 2690, dtype: object
# Drop the lone 2014-10-19 record (all other orders are from 2015) and
# remove the now-empty October level so plots and groupbys skip it.
df.drop(index=2690,inplace=True)
df['created_month']=df['created_month'].cat.remove_categories([10])
Now, check the top few records with the highest duration. There are two records taking more than one day! That's probably due to reservations (i.e., orders submitted ahead of time). I don't think they are the focus of this study, since we focus on real-time, same-day delivery (an assumption made for simplicity). So, I will drop them as well.
df.sort_values("duration",ascending=False).head(20)[['created_at','actual_delivery_time','store_primary_category','duration','estimated_order_place_duration','estimated_store_to_consumer_driving_duration']]
| created_at | actual_delivery_time | store_primary_category | duration | estimated_order_place_duration | estimated_store_to_consumer_driving_duration | |
|---|---|---|---|---|---|---|
| 185550 | 2015-01-28 08:34:06+00:00 | 2015-02-01 16:25:25+00:00 | dessert | 373879.0 | 251 | 476.0 |
| 27189 | 2015-02-16 02:24:09+00:00 | 2015-02-19 22:45:31+00:00 | indian | 332482.0 | 251 | 767.0 |
| 83055 | 2015-02-01 02:18:07+00:00 | 2015-02-01 18:08:39+00:00 | burger | 57032.0 | 251 | 602.0 |
| 190860 | 2015-02-16 02:31:05+00:00 | 2015-02-16 17:38:32+00:00 | indian | 54447.0 | 251 | 326.0 |
| 86952 | 2015-02-05 02:11:40+00:00 | 2015-02-05 15:34:38+00:00 | thai | 48178.0 | 251 | 787.0 |
| 76743 | 2015-02-15 04:17:35+00:00 | 2015-02-15 16:59:00+00:00 | pizza | 45685.0 | 446 | 540.0 |
| 105825 | 2015-02-08 04:07:51+00:00 | 2015-02-08 15:03:43+00:00 | alcohol | 39352.0 | 251 | 315.0 |
| 66787 | 2015-02-04 20:21:30+00:00 | 2015-02-05 07:02:27+00:00 | italian | 38457.0 | 446 | 648.0 |
| 175971 | 2015-02-12 20:25:17+00:00 | 2015-02-13 07:01:00+00:00 | mexican | 38143.0 | 446 | 594.0 |
| 139989 | 2015-01-21 20:56:51+00:00 | 2015-01-22 07:00:07+00:00 | mediterranean | 36196.0 | 251 | 684.0 |
| 51228 | 2015-02-02 21:53:08+00:00 | 2015-02-03 07:01:57+00:00 | cafe | 32929.0 | 446 | 829.0 |
| 63505 | 2015-01-24 08:19:17+00:00 | 2015-01-24 17:24:07+00:00 | american | 32690.0 | 251 | 730.0 |
| 31185 | 2015-02-14 22:23:13+00:00 | 2015-02-15 07:19:12+00:00 | japanese | 32159.0 | 251 | 859.0 |
| 109875 | 2015-01-27 18:18:24+00:00 | 2015-01-28 02:10:29+00:00 | italian | 28325.0 | 446 | 869.0 |
| 39989 | 2015-01-28 22:48:43+00:00 | 2015-01-29 06:38:50+00:00 | american | 28207.0 | 446 | 533.0 |
| 171404 | 2015-01-28 22:41:45+00:00 | 2015-01-29 06:13:08+00:00 | salad | 27083.0 | 251 | 246.0 |
| 29715 | 2015-02-14 21:08:06+00:00 | 2015-02-15 04:14:44+00:00 | mexican | 25598.0 | 251 | 507.0 |
| 103937 | 2015-01-25 01:00:54+00:00 | 2015-01-25 07:55:35+00:00 | other | 24881.0 | 251 | 798.0 |
| 79473 | 2015-02-15 20:28:16+00:00 | 2015-02-16 03:08:11+00:00 | catering | 23995.0 | 446 | 576.0 |
| 182592 | 2015-02-12 00:36:10+00:00 | 2015-02-12 06:58:02+00:00 | smoothie | 22912.0 | 251 | 916.0 |
df.drop(index=[185550,27189],inplace=True)
Strangely, there are 7 records with missing delivery time. Sometimes, it may be due to censoring issue that the most recent orders' delivery time is not observed yet. However, this is not the case here, since these 7 orders span across different days (it is hard to imagine it takes more than 1 day to deliver!).
As a result, this may just be a data issue and can be ignored.
df[df.actual_delivery_time.isna()].T
| 158967 | 170416 | 7670 | 109 | 78511 | 140635 | 115982 | |
|---|---|---|---|---|---|---|---|
| market_id | 2 | 5 | 2 | 3 | 4 | 2 | 4 |
| created_at | 2015-02-01 01:21:29+00:00 | 2015-02-01 01:36:33+00:00 | 2015-02-08 02:54:42+00:00 | 2015-02-10 21:51:54+00:00 | 2015-02-15 02:15:45+00:00 | 2015-02-15 02:21:42+00:00 | 2015-02-16 01:52:49+00:00 |
| actual_delivery_time | NaT | NaT | NaT | NaT | NaT | NaT | NaT |
| store_id | 314 | 2651 | 2340 | 1698 | 901 | 1661 | 1107 |
| store_primary_category | mexican | fast | japanese | sandwich | catering | dessert | pizza |
| order_protocol | 5 | 4 | 2 | 3 | 1 | 1 | 3 |
| total_items | 5 | 3 | 4 | 1 | 9 | 3 | 2 |
| subtotal | 3447 | 982 | 2860 | 1125 | 5050 | 4210 | 2094 |
| num_distinct_items | 3 | 3 | 3 | 1 | 6 | 3 | 2 |
| min_item_price | 225 | 165 | 390 | 975 | 375 | 865 | 599 |
| max_item_price | 1349 | 575 | 690 | 975 | 1125 | 1850 | 1195 |
| total_onshift_dashers | 90.0 | 41.0 | 131.0 | 7.0 | 91.0 | 123.0 | 53.0 |
| total_busy_dashers | 88.0 | 31.0 | 123.0 | 5.0 | 75.0 | 91.0 | 53.0 |
| total_outstanding_orders | 109.0 | 31.0 | 197.0 | 4.0 | 167.0 | 176.0 | 102.0 |
| estimated_order_place_duration | 251 | 251 | 251 | 251 | 446 | 446 | 251 |
| estimated_store_to_consumer_driving_duration | 572.0 | 333.0 | 723.0 | 488.0 | 770.0 | 862.0 | 433.0 |
| created_at_pst | 2015-01-31 17:21:29-08:00 | 2015-01-31 17:36:33-08:00 | 2015-02-07 18:54:42-08:00 | 2015-02-10 13:51:54-08:00 | 2015-02-14 18:15:45-08:00 | 2015-02-14 18:21:42-08:00 | 2015-02-15 17:52:49-08:00 |
| created_day_of_week | 7 | 7 | 7 | 2 | 7 | 7 | 1 |
| created_month | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| created_hour | 1 | 1 | 2 | 21 | 2 | 2 | 1 |
| created_hour_pst | 17 | 17 | 18 | 13 | 18 | 18 | 17 |
| time_slot_pst | (15.0, 17.0] | (15.0, 17.0] | (17.0, 18.0] | (12.0, 15.0] | (17.0, 18.0] | (17.0, 18.0] | (15.0, 17.0] |
| duration | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| y | False | False | False | False | False | False | False |
# drop NaN duration times
# (the 7 rows with missing actual_delivery_time, treated as a data issue above)
df = df[df.duration.notna()].copy()
The other records look reasonable.
There are negative values in 'min_item_price', 'total_onshift_dashers', 'total_busy_dashers', and 'total_outstanding_orders', which doesn't make sense. Since they are few, I set them to NaN.
# Count observations with impossible negative values in each column.
for v in ['min_item_price','total_onshift_dashers','total_busy_dashers', 'total_outstanding_orders']:
    print(f"There are {df[df[v]<0].shape[0]} obs with negative values of {v}.\n")
There are 13 obs with negative values of min_item_price. There are 21 obs with negative values of total_onshift_dashers. There are 21 obs with negative values of total_busy_dashers. There are 44 obs with negative values of total_outstanding_orders.
For now, I will change them to nan, but if I can, I should check if these entries are incorrect.
# Replace impossible negative values with NaN for later imputation.
for v in ['min_item_price','total_onshift_dashers','total_busy_dashers', 'total_outstanding_orders']:
    df.loc[df[v]<0,v] = np.nan
    # df_submit.loc[df_submit[v]<0,v] = np.nan
The first thing is to check if there are many missing values.
The only concerns are the high percentage (8%) of missing values for total_onshift_dashers, total_busy_dashers and total_outstanding_orders.
df.isna().mean()*100
market_id 0.499954 created_at 0.000000 actual_delivery_time 0.000000 store_id 0.000000 store_primary_category 2.411128 order_protocol 0.504007 total_items 0.000000 subtotal 0.000000 num_distinct_items 0.000000 min_item_price 0.006585 max_item_price 0.000000 total_onshift_dashers 8.247475 total_busy_dashers 8.247475 total_outstanding_orders 8.259125 estimated_order_place_duration 0.000000 estimated_store_to_consumer_driving_duration 0.266440 created_at_pst 0.000000 created_day_of_week 0.000000 created_month 0.000000 created_hour 0.000000 created_hour_pst 0.000000 time_slot_pst 0.000000 duration 0.000000 y 0.000000 dtype: float64
# df_submit.isna().mean()*100
Is it because zero values are recorded as missing? Let's see:
# Check whether zero counts appear in the data (i.e., zeros were not coded as missing).
for x in ['total_onshift_dashers','total_busy_dashers','total_outstanding_orders']:
    print(df[x].value_counts())
0.0 3614
18.0 2924
15.0 2912
21.0 2841
19.0 2824
...
165.0 1
159.0 1
164.0 1
163.0 1
171.0 1
Name: total_onshift_dashers, Length: 168, dtype: int64
0.0 4170
10.0 3114
13.0 3052
6.0 3040
18.0 3001
...
150.0 2
152.0 2
153.0 1
154.0 1
149.0 1
Name: total_busy_dashers, Length: 154, dtype: int64
0.0 4110
9.0 2744
10.0 2705
8.0 2685
6.0 2672
...
260.0 1
265.0 1
268.0 1
273.0 1
285.0 1
Name: total_outstanding_orders, Length: 275, dtype: int64
Nope! Zeros are present in the data. So, we may need to impute the missing values later. But before that, it is crucial to determine whether the data are missing completely at random (MCAR), missing at random (MAR), or missing not at random (MNAR). We can get a hint by comparing the mean outcome between observations with missing and non-missing values.
# check the distribution of y for missing values to see if it is (potentially) missing completely at random
for v in ['market_id','store_primary_category','order_protocol','min_item_price',
          'total_onshift_dashers','total_busy_dashers',
          'total_outstanding_orders','estimated_store_to_consumer_driving_duration']:
    # Compare the target rate between rows where v is missing vs. observed.
    missing_sample = df.loc[df[v].isna(),'y']
    valid_sample = df.loc[df[v].notna(),'y']
    print(f"The proportion of y is {missing_sample.mean():2.2f} when {v} is missing, and {valid_sample.mean():2.2f} otherwise.")
    #perform two sample t-test with unequal variances
    # (Welch's t-test on the 0/1 outcome approximates a two-proportion test)
    t_stat, p_val = stats.ttest_ind(missing_sample, valid_sample, equal_var=False,nan_policy='omit')
    print(f"The two sample t-test of equal proportion (approximated binomial test): t-stat {t_stat:2.2f} and p-value {p_val:2.2f}\n")
The proportion of y is 0.64 when market_id is missing, and 0.62 otherwise. The two sample t-test of equal proportion (approximated binomial test): t-stat 1.66 and p-value 0.10 The proportion of y is 0.63 when store_primary_category is missing, and 0.62 otherwise. The two sample t-test of equal proportion (approximated binomial test): t-stat 1.59 and p-value 0.11 The proportion of y is 0.60 when order_protocol is missing, and 0.62 otherwise. The two sample t-test of equal proportion (approximated binomial test): t-stat -1.04 and p-value 0.30 The proportion of y is 0.54 when min_item_price is missing, and 0.62 otherwise. The two sample t-test of equal proportion (approximated binomial test): t-stat -0.55 and p-value 0.59 The proportion of y is 0.61 when total_onshift_dashers is missing, and 0.62 otherwise. The two sample t-test of equal proportion (approximated binomial test): t-stat -1.63 and p-value 0.10 The proportion of y is 0.61 when total_busy_dashers is missing, and 0.62 otherwise. The two sample t-test of equal proportion (approximated binomial test): t-stat -1.71 and p-value 0.09 The proportion of y is 0.61 when total_outstanding_orders is missing, and 0.62 otherwise. The two sample t-test of equal proportion (approximated binomial test): t-stat -1.76 and p-value 0.08 The proportion of y is 0.56 when estimated_store_to_consumer_driving_duration is missing, and 0.62 otherwise. The two sample t-test of equal proportion (approximated binomial test): t-stat -2.60 and p-value 0.01
Summary
y seems unaffected by the missingness of all variables except for estimated_store_to_consumer_driving_duration (p-value 0.01).
Therefore, we could regard them as either MCAR or MAR, which makes imputation easier.
def create_additional_vars(df):
    """Derive dasher-capacity features from the raw market-state columns.

    Adds:
      - total_free_dasher: onshift minus busy dashers (can be negative,
        i.e. dashers appear "over-subscribed").
      - total_order_per_onshift_dasher: outstanding orders per onshift dasher.
      - total_free_dasher_percent: share of onshift dashers that are free.
      - create_at_date: calendar date of order creation.

    Returns a copy sorted by 'created_at'; the input frame is not mutated
    (consistent with ``convert_type``).
    """
    df = df.copy()
    df['total_free_dasher'] = df['total_onshift_dashers'] - df['total_busy_dashers']
    df['total_order_per_onshift_dasher'] = df['total_outstanding_orders'] / df['total_onshift_dashers']
    df['total_free_dasher_percent'] = df['total_free_dasher'] / df['total_onshift_dashers']
    df['create_at_date'] = df['created_at'].dt.date
    # divided by zero; this is also related to censoring: when onshift_dasher = 0,
    # we don't know what is the representation of market capacity,
    # so the resulting +/-inf ratios are treated as missing.
    ratio_cols = ['total_order_per_onshift_dasher', 'total_free_dasher_percent']
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)
    return df.sort_values('created_at')
# Derive capacity features on the training frame (and the submission frame when used).
df = create_additional_vars(df)
# df_submit = create_additional_vars(df_submit)
Let's see what variables we have at this step!
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 197418 entries, 43519 to 61787 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 market_id 196431 non-null category 1 created_at 197418 non-null datetime64[ns, UTC] 2 actual_delivery_time 197418 non-null datetime64[ns, UTC] 3 store_id 197418 non-null category 4 store_primary_category 192658 non-null category 5 order_protocol 196423 non-null category 6 total_items 197418 non-null int64 7 subtotal 197418 non-null int64 8 num_distinct_items 197418 non-null int64 9 min_item_price 197405 non-null float64 10 max_item_price 197418 non-null int64 11 total_onshift_dashers 181136 non-null float64 12 total_busy_dashers 181136 non-null float64 13 total_outstanding_orders 181113 non-null float64 14 estimated_order_place_duration 197418 non-null int64 15 estimated_store_to_consumer_driving_duration 196892 non-null float64 16 created_at_pst 197418 non-null datetime64[ns, America/Los_Angeles] 17 created_day_of_week 197418 non-null category 18 created_month 197418 non-null category 19 created_hour 197418 non-null category 20 created_hour_pst 197418 non-null category 21 time_slot_pst 197418 non-null category 22 duration 197418 non-null float64 23 y 197418 non-null bool 24 total_free_dasher 181116 non-null float64 25 total_order_per_onshift_dasher 177481 non-null float64 26 total_free_dasher_percent 177505 non-null float64 27 create_at_date 197418 non-null object dtypes: bool(1), category(9), datetime64[ns, America/Los_Angeles](1), datetime64[ns, UTC](2), float64(9), int64(5), object(1) memory usage: 31.0+ MB
df['y'] = 1*df['y']
Here I assume the duration is unknown. We are only interested in y = duration>=2400
# Class balance: roughly 62% of orders exceed the 40-minute threshold.
fig,ax = plt.subplots(figsize=(8,4),dpi=120)
sns.countplot(x='y',data =df,ax = ax)
ax.set_title(f"The percentage of y=1 is {df['y'].mean():2.2f}")
# ax.set_xscale('log')
plt.tight_layout()
# Categorical features to profile (store_id excluded: too many levels).
cat_list = ['market_id','order_protocol','created_day_of_week','created_month','created_hour','store_primary_category',
            'created_hour_pst','time_slot_pst'] # store_id too many
fig,ax = plt.subplots(int(len(cat_list)/2),2,figsize=(12,1.5*len(cat_list)),dpi=150)
ax = ax.flatten()
for i,arg in enumerate(cat_list):
    sns.countplot(x=arg,data = df,ax = ax[i])
    ax[i].set_title(f'Count of Records by Categeory of {arg}')
    # store_primary_category has many levels; rotate and shrink tick labels.
    if arg == 'store_primary_category':
        ax[i].tick_params(axis='x', rotation=90,labelsize=6)
plt.tight_layout()
Very few orders for protocol 6 and 7.
df.order_protocol.value_counts()
1 54722 3 53196 5 44288 2 24051 4 19353 6 794 7 19 Name: order_protocol, dtype: int64
The fraction of y=1 seems to be slightly affected by market_id, created_day_of_week, created_month and order_protocol.
It is higher at created_hour 8, but the latter is due to fewer available dashers and a low sample size.
store_id, hour, and primary category have high cardinality, I will use (cross-validated) target encoding for them.
# Fraction of y=1 by level, for each categorical feature.
fig,ax = plt.subplots(int(len(cat_list)/2),2,figsize=(12,1.5*len(cat_list)),dpi=150)
ax = ax.flatten()
for i,arg in enumerate(cat_list):
    sns.barplot(y='y',x = arg,data =df.query("created_month!=10"),ax = ax[i])
    # ax[i].tick_params(axis='x', rotation=20)
    # ax[i].set_ylim((0,df.duration.max()))
    ax[i].set_ylabel('Fraction of y=1')
    ax[i].set_xlabel(arg)
    if arg == 'store_primary_category':
        ax[i].tick_params(axis='x', rotation=90,labelsize=6)
# fig.suptitle('Duration in Seconds (outliers omitted)')
plt.tight_layout()
# Free-dasher availability by hour of day (PST); outliers hidden for readability.
fig,ax = plt.subplots(figsize=(4,3),dpi=120)
sns.boxplot(y='total_free_dasher',x = 'created_hour_pst',data =df[['total_free_dasher','created_hour_pst']],ax = ax,showfliers=False)
ax.tick_params(axis='x', rotation=20)
ax.set_xlabel('created_hour')
plt.tight_layout()
Note that the derived field total_free_dasher and total_free_dasher_percent are sometimes negative. Does this mean the dashers are "over-subscribed"?
df.describe(include='number',percentiles=[.1,.25,.5,.75,.9]).T
| count | mean | std | min | 10% | 25% | 50% | 75% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|
| total_items | 197418.0 | 3.196375 | 2.666567 | 1.0 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 6.000000 | 411.0 |
| subtotal | 197418.0 | 2682.325629 | 1823.109543 | 0.0 | 995.000000 | 1400.000000 | 2200.000000 | 3395.000000 | 4930.000000 | 27100.0 |
| num_distinct_items | 197418.0 | 2.670780 | 1.630266 | 1.0 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 5.000000 | 20.0 |
| min_item_price | 197405.0 | 686.267916 | 522.027355 | 0.0 | 175.000000 | 299.000000 | 595.000000 | 949.000000 | 1295.000000 | 14700.0 |
| max_item_price | 197418.0 | 1159.587094 | 558.417027 | 0.0 | 599.000000 | 800.000000 | 1095.000000 | 1395.000000 | 1795.000000 | 14700.0 |
| total_onshift_dashers | 181136.0 | 44.812373 | 34.524228 | 0.0 | 7.000000 | 17.000000 | 37.000000 | 65.000000 | 98.000000 | 171.0 |
| total_busy_dashers | 181136.0 | 41.743922 | 32.143572 | 0.0 | 6.000000 | 15.000000 | 34.000000 | 62.000000 | 90.000000 | 154.0 |
| total_outstanding_orders | 181113.0 | 58.062823 | 52.657901 | 0.0 | 7.000000 | 17.000000 | 41.000000 | 85.000000 | 139.000000 | 285.0 |
| estimated_order_place_duration | 197418.0 | 308.560131 | 90.139692 | 0.0 | 251.000000 | 251.000000 | 251.000000 | 446.000000 | 446.000000 | 2715.0 |
| estimated_store_to_consumer_driving_duration | 196892.0 | 545.356993 | 219.354817 | 0.0 | 253.000000 | 382.000000 | 544.000000 | 702.000000 | 833.000000 | 2088.0 |
| duration | 197418.0 | 2861.582323 | 1164.087414 | 101.0 | 1699.000000 | 2104.000000 | 2660.000000 | 3381.000000 | 4235.000000 | 57032.0 |
| y | 197418.0 | 0.617117 | 0.486091 | 0.0 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.0 |
| total_free_dasher | 181116.0 | 3.068718 | 11.421770 | -64.0 | -4.000000 | 0.000000 | 1.000000 | 6.000000 | 14.000000 | 86.0 |
| total_order_per_onshift_dasher | 177481.0 | 1.219184 | 0.469606 | 0.0 | 0.705882 | 0.941176 | 1.200000 | 1.478873 | 1.740741 | 47.0 |
| total_free_dasher_percent | 177505.0 | 0.049558 | 0.401570 | -30.0 | -0.150000 | 0.000000 | 0.037736 | 0.173077 | 0.339623 | 1.0 |
num_list = df.select_dtypes(include=np.number).drop(['duration','y'],axis=1).columns.values
These variables seem very important:
# For each numeric feature: box plot split by class, plus its marginal distribution.
fig,ax = plt.subplots(len(num_list),2,figsize=(10,3*len(num_list)),dpi=120)
for i,arg in enumerate(num_list):
    df_plot = df[['y', arg]].dropna()
    # Trim the extreme left tail so the percent plot stays readable.
    if arg == 'total_free_dasher_percent':
        df_plot = df_plot.query('total_free_dasher_percent>-.15')
    sns.boxplot(y=arg,x='y',data = df_plot,ax = ax[i,0],showfliers=False)
    sns.histplot(df_plot[arg].dropna(),ax = ax[i,1],kde = True,stat='density')
plt.tight_layout()
There are two groups of highly correlated features (correlation > 0.6):
To avoid singular matrix inversion in linear regression, I will therefore use Ridge regression later to improve numerical stability.
# Pairwise correlation heatmap of the numeric features and the target.
fig,ax = plt.subplots(figsize = (13,13),dpi = 120)
cmap = sns.cubehelix_palette(light=1, as_cmap=True)
sns.heatmap(df[num_list.tolist() + ['y']].dropna().corr(),
            cmap = cmap, ax=ax, annot=True, linewidth=0.1,square = True);
I will use the following list to keep track of the variables
# Track candidate model features; drop the redundant PST time fields and the leaky raw duration.
candidates = df.drop(['created_hour_pst','created_at_pst','time_slot_pst','duration'],axis=1).columns.values.tolist()
candidates
['market_id', 'created_at', 'actual_delivery_time', 'store_id', 'store_primary_category', 'order_protocol', 'total_items', 'subtotal', 'num_distinct_items', 'min_item_price', 'max_item_price', 'total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders', 'estimated_order_place_duration', 'estimated_store_to_consumer_driving_duration', 'created_day_of_week', 'created_month', 'created_hour', 'y', 'total_free_dasher', 'total_order_per_onshift_dasher', 'total_free_dasher_percent', 'create_at_date']
The target encoding code is adapted from this link. Note that I don't want to use TimeSeriesSplit for target encoding, because the latest time-series fold is never used as training data, while the earlier folds are used multiple times.
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    """Out-of-fold target encoder for one high-cardinality column (train set).

    For each CV fold, validation rows receive the category -> target mean
    computed on the training rows only, which avoids leaking a row's own
    target into its encoding. Categories unseen in the training folds fall
    back to the overall target mean.
    """

    def __init__(self, colnames, targetName,
                 cv_fold=None,
                 verbosity=True, discardOriginal_col=False):
        # colnames: name of the (single) column to encode.
        # targetName: name of the target column, which must be present in X.
        # cv_fold: CV splitter; None means a fresh 5-fold shuffled KFold.
        #   (The None default replaces a KFold instance created at
        #   class-definition time, which was shared by every encoder
        #   constructed without an explicit cv_fold.)
        self.colnames = colnames
        self.targetName = targetName
        self.cv_fold = KFold(n_splits=5, shuffle=True) if cv_fold is None else cv_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        # Stateless: all work happens in transform (train-time encoding).
        return self

    def transform(self, X):
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)
        X = X.copy()  # do not mutate the caller's frame
        mean_of_target = X[self.targetName].mean()
        col_mean_name = self.colnames + '_' + 'encoded'
        X[col_mean_name] = np.nan
        for tr_ind, val_ind in self.cv_fold.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            # Map each validation row's category to the mean target of that
            # category computed on the training fold only.
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames]\
                .map(X_tr.groupby(self.colnames)[self.targetName].mean()).astype(float)
        # Unseen categories fall back to the global target mean.
        X[col_mean_name].fillna(mean_of_target, inplace = True)
        if self.verbosity:
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
                                                                                  self.targetName,
                                                                                  X[[self.targetName,col_mean_name]].corr().values[0][1]))
        if self.discardOriginal_col:
            # NOTE(review): despite the name, this drops the *target* column,
            # matching the adapted source code — confirm this is the intent.
            X = X.drop(self.targetName, axis=1)
        return X
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply a train-set target encoding to a test/submission frame.

    For each category of colNames, looks up the mean of the already-encoded
    train column encodedName; categories absent from the train set fall back
    to the train encoding's overall mean.
    """

    def __init__(self, train, colNames, encodedName):
        # train: training frame that already contains the encodedName column.
        # colNames: categorical column to encode in the test frame.
        # encodedName: name of the encoded column produced on the train set.
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        # Stateless: the lookup table is derived from self.train in transform.
        return self

    def transform(self, X):
        X = X.copy()  # do not mutate the caller's frame
        mean_of_encoded_target = self.train[self.encodedName].mean()
        # Per-category mean of the train-set encoding, as a lookup dict.
        mean = self.train[[self.colNames, self.encodedName]].groupby(self.colNames).mean().iloc[:, 0].to_dict()
        X[self.encodedName] = X[self.colNames].map(mean).astype(float)
        # Unseen categories fall back to the overall train-set encoding mean.
        X[self.encodedName].fillna(mean_of_encoded_target, inplace = True)
        return X
There are three high-cardinality features:
I will use target encoding with CV to encode these variables.
high_card = ['store_id','store_primary_category','created_hour']
The distribution of duration seems wide, so we have some variation.
# Distribution of the per-level mean of y for each high-cardinality feature.
fig,ax = plt.subplots(len(high_card),1,figsize=(6,3*len(high_card)),dpi=120)
ax = ax.flatten()
for i,arg in enumerate(high_card):
    df_plot = df.groupby(arg)['y'].mean().reset_index()
    sns.histplot(x='y',data =df_plot,ax = ax[i],kde=True)
    ax[i].set_title(f'y Distribution by {arg}')
    # ax[i].set_xscale('log')
plt.tight_layout()
Checking the # of orders per category. With CV target encoding, a category with a low # of observations will simply be encoded close(r) to the overall average of the target.
# Order counts per store: many stores have very few orders (median 11).
df.groupby('store_id')['duration'].count().reset_index()\
    .rename(columns={'duration':'# of orders per store'}).describe(percentiles=[.1,.25,.5,.75,.9]).T
| count | mean | std | min | 10% | 25% | 50% | 75% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|
| # of orders per store | 6743.0 | 29.277473 | 56.484772 | 1.0 | 2.0 | 4.0 | 11.0 | 29.0 | 71.8 | 937.0 |
df.groupby('store_primary_category')['duration'].count().reset_index()\
.rename(columns={'duration':'# of orders per primary category'}).describe(percentiles=[.1,.25,.5,.75,.9]).T
| count | mean | std | min | 10% | 25% | 50% | 75% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|
| # of orders per primary category | 74.0 | 2603.486486 | 4287.071441 | 1.0 | 22.6 | 81.0 | 519.5 | 2728.0 | 8351.0 | 19399.0 |
df.groupby('created_hour')['duration'].count().reset_index()\
.rename(columns={'duration':'# of orders per created_hour'})
| created_hour | # of orders per created_hour | |
|---|---|---|
| 0 | 0 | 12669 |
| 1 | 1 | 28187 |
| 2 | 2 | 36972 |
| 3 | 3 | 27068 |
| 4 | 4 | 15250 |
| 5 | 5 | 7095 |
| 6 | 6 | 1416 |
| 7 | 7 | 11 |
| 8 | 8 | 1 |
| 9 | 14 | 40 |
| 10 | 15 | 538 |
| 11 | 16 | 2109 |
| 12 | 17 | 3413 |
| 13 | 18 | 5100 |
| 14 | 19 | 13541 |
| 15 | 20 | 15560 |
| 16 | 21 | 11464 |
| 17 | 22 | 8821 |
| 18 | 23 | 8163 |
# Fit CV target encodings for each high-cardinality feature on the train frame.
for var in high_card:
    targetc = KFoldTargetEncoderTrain(var,'y')
    df = targetc.fit_transform(df.sort_values('created_at'))
    # test_targetc = KFoldTargetEncoderTest(df,var,var+'_encoded')
    # df_submit = test_targetc.fit_transform(df_submit)
Correlation between the new feature, store_id_encoded and, y is 0.26781509669144415. Correlation between the new feature, store_primary_category_encoded and, y is 0.11886541631273892. Correlation between the new feature, created_hour_encoded and, y is 0.2611704491000951.
# for var in high_card:
# print(df_submit[var+'_encoded'].isna().mean())
# Histograms of the per-level target-encoded values for each feature.
fig,axes = plt.subplots(len(high_card),1,figsize=(6,4*len(high_card)),dpi=120)
for ax,var in zip(axes, high_card):
    df[[var,var+'_encoded']].groupby(var).mean().hist(bins=50,ax=ax,grid=False)
    ax.set_title(f"Histogram of Target-Encoded {var}")
plt.tight_layout()
# NOTE(review): 'created_hour' appears twice in this list; possibly one slot
# was meant for another feature — confirm against the intended plots.
fig,axes = plt.subplots(3,1,figsize=(8,4*3),dpi=120)
for ax,var in zip(axes, ['store_primary_category','created_hour','created_hour']):
    df[[var,var+'_encoded']].groupby(var).mean().plot.bar(ax=ax)
    ax.set_title(f"Average of Target-Encoded {var}")
plt.tight_layout()
**Feature Engineering 1**
# Replace the raw high-cardinality columns with their encoded versions in the candidate list.
for var in high_card:
    candidates.remove(var)
    candidates.append(var+"_encoded")
df[candidates].select_dtypes('category').columns.values.tolist()
['market_id', 'order_protocol', 'created_day_of_week', 'created_month']
I can use one-hot encoding, but since lightgbm handles these integer-coded variables well, I will keep them as-is.
The following numeric variables also measure market capacity, yet some of them are highly correlated with each other.
One way using dimension reduction to combine them (e.g., PCA). But it only considers linear combination thereof. Here, I will use a different method:
# Capacity features to discretize into bands.
bin_vars = ['total_free_dasher','total_free_dasher_percent']
# corr coeff < 0.1
binned_vars = [var+'_band' for var in bin_vars]
df[bin_vars].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9]).T
| count | mean | std | min | 10% | 20% | 30% | 40% | 50% | 60% | 70% | 80% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| total_free_dasher | 181116.0 | 3.068718 | 11.42177 | -64.0 | -4.00 | -1.000000 | 0.0 | 0.0 | 1.000000 | 3.000000 | 4.000000 | 8.00000 | 14.000000 | 86.0 |
| total_free_dasher_percent | 177505.0 | 0.049558 | 0.40157 | -30.0 | -0.15 | -0.027778 | 0.0 | 0.0 | 0.037736 | 0.076923 | 0.136364 | 0.21875 | 0.339623 | 1.0 |
def band_features(df):
    """Discretize dasher-availability features into ordered bands.

    Adds two ordered-categorical columns to *df*:
      - 'total_free_dasher_band': bins of total_free_dasher
      - 'total_free_dasher_percent_band': bins of total_free_dasher_percent
    Rows with missing inputs keep NaN bands. Returns the same DataFrame.
    """
    # pd.cut with right-closed bins reproduces the original chain of
    # df.loc[...] assignments ((a, b] intervals) in one call, returns an
    # ordered categorical, and leaves NaN inputs as NaN bands.
    df['total_free_dasher_band'] = pd.cut(
        df.total_free_dasher,
        bins=[-np.inf, 1, 3, 5, 8, 14, np.inf],
        labels=['leq_1', '1_to_3', '3_to_5', '5_to_8', '8_to_14', 'gt_14'],
        right=True,
        ordered=True)
    df['total_free_dasher_percent_band'] = pd.cut(
        df.total_free_dasher_percent,
        bins=[-np.inf, 0.04, 0.08, 0.14, 0.22, 0.34, np.inf],
        labels=['leq_4pct', '4_to_8pct', '8_to_14pct',
                '14_to_22pct', '22_to_34pct', 'gt_34pct'],
        right=True,
        ordered=True)
    # df['total_onshift_dashers_band'] = pd.qcut(df['total_order_per_onshift_dasher'],10)
    # df['min_item_price_band'] = pd.qcut(df['min_item_price'],10)
    return df
df = band_features(df)
# df_submit = band_features(df_submit)
# Bar plot of the positive-class rate within each band.
fig, ax = plt.subplots(len(binned_vars), 1, figsize=(5, 5 * len(binned_vars)), dpi=120)
for axis, band_col in zip(ax, binned_vars):
    sns.barplot(y='y', x=band_col, data=df, ax=axis)
    axis.tick_params(axis='x', rotation=20)
    axis.set_xlabel(band_col)
fig.suptitle('y=1 percentage')
plt.tight_layout()
There is an ordering relationship between these bands and the target. In this case, I will use target encoding again to help the ML model learn this feature.
# Target-encode each band column with the K-fold encoder; the data is kept
# in time order so the folds respect the time-series structure.
for band_col in binned_vars:
    encoder = KFoldTargetEncoderTrain(band_col, 'y')
    df = encoder.fit_transform(df.sort_values('created_at'))
    # test_targetc = KFoldTargetEncoderTest(df, band_col, band_col + '_encoded')
    # df_submit = test_targetc.fit_transform(df_submit)
Correlation between the new feature, total_free_dasher_band_encoded and, y is 0.0932987775881724. Correlation between the new feature, total_free_dasher_percent_band_encoded and, y is 0.1530017246623592.
fig,axes = plt.subplots(len(binned_vars),1,figsize=(8,4*len(binned_vars)),dpi=120)
for ax,var in zip(axes, binned_vars):
df[[var,var+'_encoded']].groupby(var).mean().plot.bar(ax=ax)
ax.set_title(f"Average of Target-Encoded {var}")
plt.tight_layout()
And we also solved for their missing values problem! Nice!
# Confirm the encoded band features have no missing values left.
df[[x+'_encoded' for x in binned_vars]].isna().mean()
total_free_dasher_band_encoded 0.0 total_free_dasher_percent_band_encoded 0.0 dtype: float64
# df_submit[[x+'_encoded' for x in binned_vars]].isna().mean()
**Feature Engineering 2**
# Add the encoded band features to the candidate list and drop the raw
# (un-binned) inputs they replace.
candidates += [f"{var}_encoded" for var in binned_vars]
for raw_var in bin_vars:
    candidates.remove(raw_var)
Market ID is an important identifier, and will affect subsequent analysis. For simplicity, I will fill it with the most frequent category. This is ok, because less than 0.5% of market id is missing, and there is no difference in average duration.
# percentage of missing values per candidate column
df[candidates].isna().mean().mul(100)
market_id 0.499954 created_at 0.000000 actual_delivery_time 0.000000 order_protocol 0.504007 total_items 0.000000 subtotal 0.000000 num_distinct_items 0.000000 min_item_price 0.006585 max_item_price 0.000000 total_onshift_dashers 8.247475 total_busy_dashers 8.247475 total_outstanding_orders 8.259125 estimated_order_place_duration 0.000000 estimated_store_to_consumer_driving_duration 0.266440 created_day_of_week 0.000000 created_month 0.000000 y 0.000000 total_order_per_onshift_dasher 10.098876 create_at_date 0.000000 store_id_encoded 0.000000 store_primary_category_encoded 0.000000 created_hour_encoded 0.000000 total_free_dasher_band_encoded 0.000000 total_free_dasher_percent_band_encoded 0.000000 dtype: float64
def fill_mode(df, var):
    """Return a copy of *df* with missing values of column *var*
    replaced by the column's most frequent value (its mode)."""
    df = df.copy()
    # Assign instead of fillna(..., inplace=True): in-place fillna on a
    # column selection is chained assignment, which is unreliable and
    # deprecated in modern pandas.
    df[var] = df[var].fillna(df[var].mode().iloc[0])
    return df
# Impute missing market_id with the most frequent market (<0.5% missing).
df = fill_mode(df,'market_id')
# df_submit = fill_mode(df_submit,'market_id')
The only variables with a significant percentage missing are 'total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders' and 'total_order_per_onshift_dasher'.
The rest I will just leave for lightgbm to handle.
I will use the IterativeImputer in Scikit-Learn. However, I need to one-hot encode the remaining cat variables first.
# Columns excluded from the imputation design matrix.
drop_list=['y','duration','created_at','actual_delivery_time','create_at_date','created_month'] # since the test set only has one month
# categorical columns to one-hot encode before imputing
cats = ['market_id', 'order_protocol', 'created_day_of_week','time_slot_pst']
# numeric columns whose missing values will be filled
to_impute = ['total_busy_dashers','total_outstanding_orders','total_order_per_onshift_dasher','total_onshift_dashers',
             'estimated_store_to_consumer_driving_duration']
def impute_missing(df, cats, candidates, drop_list, to_impute, estimator=None, imputer=None):
    """
    Use iterative imputer to impute missing values.

    df: input frame (not mutated; a copy is returned)
    cats: list of categorical var names (one-hot encoded before imputation)
    candidates: all relevant vars
    drop_list: vars to drop before passing to imputation
    to_impute: list of vars whose missing entries are filled
    estimator: optional regressor for IterativeImputer (default: BayesianRidge)
    imputer: an already-fitted imputer to reuse (e.g. on a test set); if None,
             a new one is fitted on this data

    Returns (df with imputed columns, fitted imputer).
    """
    from sklearn.linear_model import BayesianRidge
    df = df.copy()
    ## create dummy variables
    df_dummy = pd.concat([df[candidates].drop(cats + drop_list, axis=1, errors='ignore'),
                          pd.get_dummies(df[cats], drop_first=False, dummy_na=True)], axis=1)
    if imputer is None:
        if estimator is None:
            # estimator = lgb.LGBMRegressor(random_state=7)
            # NOTE(review): the original passed normalize=True, a parameter
            # removed from BayesianRidge in scikit-learn 1.2; scale the inputs
            # beforehand if normalization is needed.
            estimator = BayesianRidge()
        imputer = IterativeImputer(random_state=11, estimator=estimator, skip_complete=True)
        imputer.fit(df_dummy)
    # NOTE(review): when reusing a fitted imputer, df_dummy must have the same
    # columns in the same order as at fit time — verify for new data.
    X_new = imputer.transform(df_dummy)
    for i, var in enumerate(df_dummy.columns):
        if var in to_impute:
            # only overwrite entries that were actually missing
            df[var] = np.where(df[var].isna(), X_new[:, i], df[var])
    return df, imputer
# Fit the imputer with a LightGBM regressor and compare the imputed vs
# original distributions visually.
df_imputed,imputer = impute_missing(df,cats,candidates,drop_list,to_impute,
                                    estimator=lgb.LGBMRegressor(random_state=7),
                                    imputer=None)
df_imputed[to_impute].hist();
df[to_impute].hist();
# summary statistics after imputation (counts now equal the full sample)
df_imputed[to_impute].describe()
| total_busy_dashers | total_outstanding_orders | total_order_per_onshift_dasher | total_onshift_dashers | estimated_store_to_consumer_driving_duration | |
|---|---|---|---|---|---|
| count | 197418.000000 | 197418.000000 | 197418.000000 | 197418.000000 | 197418.000000 |
| mean | 41.340929 | 58.560071 | 1.232160 | 44.519496 | 545.343713 |
| std | 30.825691 | 50.486223 | 0.481114 | 33.092971 | 219.071093 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 17.000000 | 19.000000 | 0.953125 | 19.000000 | 383.000000 |
| 50% | 36.686763 | 47.000000 | 1.238095 | 40.000000 | 544.000000 |
| 75% | 59.000000 | 80.000000 | 1.537313 | 62.000000 | 702.000000 |
| max | 154.000000 | 285.000000 | 47.000000 | 171.000000 | 2088.000000 |
# summary statistics before imputation, for comparison
df[to_impute].describe()
| total_busy_dashers | total_outstanding_orders | total_order_per_onshift_dasher | total_onshift_dashers | estimated_store_to_consumer_driving_duration | |
|---|---|---|---|---|---|
| count | 181136.000000 | 181113.000000 | 177481.000000 | 181136.000000 | 196892.000000 |
| mean | 41.743922 | 58.062823 | 1.219184 | 44.812373 | 545.356993 |
| std | 32.143572 | 52.657901 | 0.469606 | 34.524228 | 219.354817 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 15.000000 | 17.000000 | 0.941176 | 17.000000 | 382.000000 |
| 50% | 34.000000 | 41.000000 | 1.200000 | 37.000000 | 544.000000 |
| 75% | 62.000000 | 85.000000 | 1.478873 | 65.000000 | 702.000000 |
| max | 154.000000 | 285.000000 | 47.000000 | 171.000000 | 2088.000000 |
# df_submit_imputed,_ = impute_missing(df_submit,cats,
# candidates=list(set(candidates)-set(['actual_delivery_time', 'duration'])),
# drop_list=drop_list+['delivery_id','platform'],
# to_impute=to_impute,imputer=imputer)
# df_imputed[candidates].isna().mean()*100
# Replace the working frame with the imputed one and free the old copy.
df = df_imputed.copy()
# df_submit = df_submit_imputed.copy()
del df_imputed #,df_submit_imputed
Time series data usually have high auto-correlation. Historical averages are thus very useful.
Calculate moving averages over the last week by market_id and time_slot.
# NOTE(review): Series.dt.week was deprecated and removed in pandas 2.0;
# use the ISO calendar week instead (cast back to a plain int dtype —
# dt.week was already the ISO week number, so the values are unchanged).
df['created_week_pst'] = df['created_at_pst'].dt.isocalendar().week.astype(int)
# df_submit['created_week_pst'] = df_submit['created_at_pst'].dt.isocalendar().week.astype(int)
df['created_month_pst'] = df['created_at_pst'].dt.month
# df_submit['created_month_pst'] = df_submit['created_at_pst'].dt.month
seg_vars = ['market_id','time_slot_pst']
order_vars = ['created_week_pst']
# weekly mean of y per (market, time-slot) segment
df_agg = df[seg_vars+order_vars+['y']].groupby(seg_vars+order_vars).agg(mean_duration=('y','mean')).reset_index()
# extend one more week
# df_temp = df_submit[seg_vars+order_vars].copy()
# df_temp['created_week_pst'] += 1
# df_all = pd.concat([df[seg_vars+order_vars],df_submit[seg_vars+order_vars],df_temp]).drop_duplicates().reset_index(drop=True)\
#     .merge(df_agg,how='left',validate="1:1")
# one row per (market, time-slot, week), with that segment-week's mean y
df_all = df[seg_vars+order_vars].drop_duplicates().reset_index(drop=True)\
    .merge(df_agg,how='left',validate="1:1")
def fill_mean_duration(df_all, seg_vars=None, order_vars=None, forward_fill=True):
    """
    Fill missing values of the weekly segment-level mean of y.

    Strategy, applied in order:
      1. forward- (or backward-) fill within each (market, time-slot) segment
         along the week ordering,
      2. fall back to the average for the same time slot,
      3. fall back to the average for the same market.
    Returns a filled copy of df_all.
    """
    if seg_vars is None:
        seg_vars = ['market_id', 'time_slot_pst']
    if order_vars is None:
        order_vars = ['created_week_pst']
    out = df_all.copy()

    def in_order(frame):
        # segments ordered by week, so ffill/bfill propagate along time
        return frame.sort_values(seg_vars + order_vars)

    # step 1: propagate the last (or next) observed value within each segment
    fill_method = 'ffill' if forward_fill else 'bfill'
    out['mean_duration'] = in_order(out).groupby(seg_vars)['mean_duration'] \
        .transform(lambda s: getattr(s, fill_method)())
    # step 2: fill remaining gaps with the time-slot average
    out['mean_duration'] = in_order(out).groupby(['time_slot_pst'])['mean_duration'] \
        .transform(lambda s: s.fillna(s.mean()))
    # step 3: fill remaining gaps with the market average
    out['mean_duration'] = in_order(out).groupby(['market_id'])['mean_duration'] \
        .transform(lambda s: s.fillna(s.mean()))
    return out
# Forward-fill the weekly means and inspect market 1 as a sanity check.
df_filled = fill_mean_duration(df_all,seg_vars,order_vars,forward_fill = True)
df_filled[df_filled.market_id==1].sort_values(seg_vars+order_vars)
| market_id | time_slot_pst | created_week_pst | mean_duration | |
|---|---|---|---|---|
| 0 | 1 | (-0.001, 12.0] | 4 | 0.510669 |
| 36 | 1 | (-0.001, 12.0] | 5 | 0.635383 |
| 72 | 1 | (-0.001, 12.0] | 6 | 0.607607 |
| 109 | 1 | (-0.001, 12.0] | 7 | 0.657767 |
| 144 | 1 | (-0.001, 12.0] | 8 | 0.708015 |
| 8 | 1 | (12.0, 15.0] | 4 | 0.540315 |
| 42 | 1 | (12.0, 15.0] | 5 | 0.617602 |
| 81 | 1 | (12.0, 15.0] | 6 | 0.560976 |
| 114 | 1 | (12.0, 15.0] | 7 | 0.578720 |
| 152 | 1 | (12.0, 15.0] | 8 | 0.591195 |
| 14 | 1 | (15.0, 17.0] | 4 | 0.705796 |
| 49 | 1 | (15.0, 17.0] | 5 | 0.720000 |
| 89 | 1 | (15.0, 17.0] | 6 | 0.705331 |
| 121 | 1 | (15.0, 17.0] | 7 | 0.716923 |
| 160 | 1 | (15.0, 17.0] | 8 | 0.640523 |
| 22 | 1 | (17.0, 18.0] | 4 | 0.694060 |
| 54 | 1 | (17.0, 18.0] | 5 | 0.801734 |
| 92 | 1 | (17.0, 18.0] | 6 | 0.802428 |
| 129 | 1 | (17.0, 18.0] | 7 | 0.819058 |
| 164 | 1 | (17.0, 18.0] | 8 | 0.747642 |
| 25 | 1 | (18.0, 19.0] | 4 | 0.579053 |
| 61 | 1 | (18.0, 19.0] | 5 | 0.705107 |
| 96 | 1 | (18.0, 19.0] | 6 | 0.786620 |
| 132 | 1 | (18.0, 19.0] | 7 | 0.782119 |
| 172 | 1 | (18.0, 19.0] | 8 | 0.751429 |
| 33 | 1 | (19.0, 23.0] | 4 | 0.336766 |
| 66 | 1 | (19.0, 23.0] | 5 | 0.501324 |
| 105 | 1 | (19.0, 23.0] | 6 | 0.588131 |
| 141 | 1 | (19.0, 23.0] | 7 | 0.559862 |
| 177 | 1 | (19.0, 23.0] | 8 | 0.550877 |
# Same view built from df_all's own columns; the original indexed df_all with
# a boolean mask computed on df_filled, which only works because the two
# frames happen to share an index.
df_all[df_all.market_id==1].sort_values(seg_vars+order_vars)
| market_id | time_slot_pst | created_week_pst | mean_duration | |
|---|---|---|---|---|
| 0 | 1 | (-0.001, 12.0] | 4 | 0.510669 |
| 36 | 1 | (-0.001, 12.0] | 5 | 0.635383 |
| 72 | 1 | (-0.001, 12.0] | 6 | 0.607607 |
| 109 | 1 | (-0.001, 12.0] | 7 | 0.657767 |
| 144 | 1 | (-0.001, 12.0] | 8 | 0.708015 |
| 8 | 1 | (12.0, 15.0] | 4 | 0.540315 |
| 42 | 1 | (12.0, 15.0] | 5 | 0.617602 |
| 81 | 1 | (12.0, 15.0] | 6 | 0.560976 |
| 114 | 1 | (12.0, 15.0] | 7 | 0.578720 |
| 152 | 1 | (12.0, 15.0] | 8 | 0.591195 |
| 14 | 1 | (15.0, 17.0] | 4 | 0.705796 |
| 49 | 1 | (15.0, 17.0] | 5 | 0.720000 |
| 89 | 1 | (15.0, 17.0] | 6 | 0.705331 |
| 121 | 1 | (15.0, 17.0] | 7 | 0.716923 |
| 160 | 1 | (15.0, 17.0] | 8 | 0.640523 |
| 22 | 1 | (17.0, 18.0] | 4 | 0.694060 |
| 54 | 1 | (17.0, 18.0] | 5 | 0.801734 |
| 92 | 1 | (17.0, 18.0] | 6 | 0.802428 |
| 129 | 1 | (17.0, 18.0] | 7 | 0.819058 |
| 164 | 1 | (17.0, 18.0] | 8 | 0.747642 |
| 25 | 1 | (18.0, 19.0] | 4 | 0.579053 |
| 61 | 1 | (18.0, 19.0] | 5 | 0.705107 |
| 96 | 1 | (18.0, 19.0] | 6 | 0.786620 |
| 132 | 1 | (18.0, 19.0] | 7 | 0.782119 |
| 172 | 1 | (18.0, 19.0] | 8 | 0.751429 |
| 33 | 1 | (19.0, 23.0] | 4 | 0.336766 |
| 66 | 1 | (19.0, 23.0] | 5 | 0.501324 |
| 105 | 1 | (19.0, 23.0] | 6 | 0.588131 |
| 141 | 1 | (19.0, 23.0] | 7 | 0.559862 |
| 177 | 1 | (19.0, 23.0] | 8 | 0.550877 |
Now merge with previous data
# quick structural check before merging back onto the order-level frame
df_filled.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 180 entries, 0 to 179 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 market_id 180 non-null category 1 time_slot_pst 180 non-null category 2 created_week_pst 180 non-null int64 3 mean_duration 180 non-null float64 dtypes: category(2), float64(1), int64(1) memory usage: 5.0 KB
# Shift the week forward so week w's average is merged onto week w+1 rows —
# each order only sees LAST week's mean (no leakage from the current week).
# suffixes=[False,False] makes the merge raise if any column would collide.
df_filled['created_week_pst'] +=1 # last week so week 4 in df_filled is matched with 5 in df
df2 = df.merge(df_filled,on=seg_vars + order_vars , suffixes=[False,False],validate = "m:1",how='left')
# df_submit2 = df_submit.merge(df_filled,on=seg_vars + order_vars , suffixes=[False,False],validate = "m:1",how='left')
**Feature Engineering 3**
candidates.append('mean_duration')
# Backward-fill this time: the earliest week has no "last week" average,
# so its gaps are filled from the following weeks / segment fallbacks.
df = fill_mean_duration(df2,seg_vars,order_vars,forward_fill = False)
# df_submit = fill_mean_duration(df_submit2,seg_vars,order_vars,forward_fill = False)
# df_submit.isna().mean()*100
df.isna().mean()*100
market_id 0.000000 created_at 0.000000 actual_delivery_time 0.000000 store_id 0.000000 store_primary_category 2.411128 order_protocol 0.504007 total_items 0.000000 subtotal 0.000000 num_distinct_items 0.000000 min_item_price 0.006585 max_item_price 0.000000 total_onshift_dashers 0.000000 total_busy_dashers 0.000000 total_outstanding_orders 0.000000 estimated_order_place_duration 0.000000 estimated_store_to_consumer_driving_duration 0.000000 created_at_pst 0.000000 created_day_of_week 0.000000 created_month 0.000000 created_hour 0.000000 created_hour_pst 0.000000 time_slot_pst 0.000000 duration 0.000000 y 0.000000 total_free_dasher 8.257606 total_order_per_onshift_dasher 0.000000 total_free_dasher_percent 10.086720 create_at_date 0.000000 store_id_encoded 0.000000 store_primary_category_encoded 0.000000 created_hour_encoded 0.000000 total_free_dasher_band 8.257606 total_free_dasher_percent_band 10.086720 total_free_dasher_band_encoded 0.000000 total_free_dasher_percent_band_encoded 0.000000 created_week_pst 0.000000 created_month_pst 0.000000 mean_duration 0.000000 dtype: float64
I will use two sets of models: one is a regularized (ridge) logistic regression, and the other is gradient boosting.
# Design matrix for the linear (logistic) model: one-hot encode the
# categoricals, mean-impute remaining numeric NaNs, drop the raw categorical
# columns and timestamps, then split into features and label.
dummy_cols = ['market_id', 'order_protocol', 'created_day_of_week',
              'created_month', 'time_slot_pst']
dummies = pd.get_dummies(df[dummy_cols], drop_first=True, dummy_na=True)
df_with_dummy = pd.concat([df[candidates], dummies], axis=1)
df_with_dummy = df_with_dummy.fillna(df_with_dummy.mean())
raw_drop = ['market_id', 'order_protocol', 'created_day_of_week',
            'actual_delivery_time', 'created_at', 'create_at_date', 'time_slot_pst']
df_with_dummy = df_with_dummy.drop(raw_drop, axis=1, errors='ignore')
df_with_dummy = df_with_dummy.dropna()
X_dummies = df_with_dummy.drop(['y'], axis=1)
y_dummies = df_with_dummy['y']
# L2-regularized logistic regression; C is chosen by time-series CV on
# log-loss (we care about probability quality), with balanced class weights.
ridge_reg = LogisticRegressionCV(Cs=40,penalty='l2',scoring = 'neg_log_loss', # since we care about prob
                                 class_weight = 'balanced',cv = TimeSeriesSplit(5)).fit(X_dummies, y_dummies)
# selected inverse-regularization strength
ridge_reg.C_
array([0.04641589])
# mean CV score (neg log-loss) for the positive class across folds and Cs
ridge_reg.scores_[1].mean()
-0.585110686101969
# Coefficient table: intercept first, then one row per feature.
feature_names = ['intercept'] + X_dummies.columns.values.tolist()
coef_values = ridge_reg.intercept_.tolist() + ridge_reg.coef_.ravel().tolist()
ridge_reg_res = pd.DataFrame({'feature': feature_names,
                              'coef': coef_values}).set_index('feature')
ridge_reg_res
| coef | |
|---|---|
| feature | |
| intercept | -0.748020 |
| total_items | -0.014637 |
| subtotal | 0.000238 |
| num_distinct_items | 0.007414 |
| min_item_price | -0.000062 |
| max_item_price | 0.000032 |
| total_onshift_dashers | -0.042812 |
| total_busy_dashers | -0.010699 |
| total_outstanding_orders | 0.040150 |
| estimated_order_place_duration | -0.000589 |
| estimated_store_to_consumer_driving_duration | 0.002353 |
| created_month | -0.412766 |
| total_order_per_onshift_dasher | 0.134960 |
| store_id_encoded | 0.821823 |
| store_primary_category_encoded | -0.266444 |
| created_hour_encoded | -0.004743 |
| total_free_dasher_band_encoded | -0.319562 |
| total_free_dasher_percent_band_encoded | -0.268013 |
| mean_duration | 0.029199 |
| market_id_2 | -0.194632 |
| market_id_3 | -0.100212 |
| market_id_4 | -0.130758 |
| market_id_5 | -0.030054 |
| market_id_6 | -0.427935 |
| market_id_nan | 0.000000 |
| order_protocol_2 | -0.268343 |
| order_protocol_3 | -0.439218 |
| order_protocol_4 | 0.004820 |
| order_protocol_5 | -0.521719 |
| order_protocol_6 | 0.066960 |
| order_protocol_7 | -0.000784 |
| order_protocol_nan | -0.022394 |
| created_day_of_week_2.0 | -0.130957 |
| created_day_of_week_3.0 | -0.296114 |
| created_day_of_week_4.0 | -0.161863 |
| created_day_of_week_5.0 | -0.172609 |
| created_day_of_week_6.0 | 0.115242 |
| created_day_of_week_7.0 | 0.010142 |
| created_day_of_week_nan | 0.000000 |
| created_month_2.0 | 0.250386 |
| created_month_nan | 0.000000 |
| time_slot_pst_(12.0, 15.0] | -0.293682 |
| time_slot_pst_(15.0, 17.0] | 0.164950 |
| time_slot_pst_(17.0, 18.0] | 0.602281 |
| time_slot_pst_(18.0, 19.0] | 0.187061 |
| time_slot_pst_(19.0, 23.0] | -0.508972 |
| time_slot_pst_nan | 0.000000 |
I suspect the default parameters are good enough. Nonetheless, I will use Bayesian Optimization to tune the hyperparameters. I will only tune the most important ones.
I first use early stopping to automatically select num_iterations, and focus on the other parameters. This usually speeds up tuning.
# df_train[candidates]
# Feature matrix and label for LightGBM, kept in time order for the
# time-series CV splits.
df_sorted = df.sort_values('created_at')
X = df_sorted[candidates].drop(['y', 'actual_delivery_time', 'created_at', 'create_at_date'], axis=1).copy()
y = df_sorted['y'].copy()
lgb_data = lgb.Dataset(X, y, free_raw_data=False)
# names of pandas 'category' columns, passed to LightGBM as categoricals
categorical_features = X.dtypes[X.dtypes == 'category'].index.to_numpy()
print(categorical_features)
['market_id' 'order_protocol' 'created_day_of_week' 'created_month']
# num_iterations will be chosen by early stopping
def lgb_tuning(learning_rate=0.1, num_leaves=34, min_data_in_leaf=500, num_boost_round=500, early_stopping_rounds=18):
    """CV objective for Bayesian optimization.

    Trains LightGBM under a 5-fold time-series split and returns the
    NEGATIVE final binary log-loss, so that larger is better for the
    maximizing optimizer.
    """
    params = dict(
        num_leaves=int(num_leaves),
        learning_rate=learning_rate,
        min_data_in_leaf=int(min_data_in_leaf),
        verbose=-1,
        saved_feature_importance_type=1,
        metric="binary_logloss",
        objective='binary',
        # 'is_unbalance' left at its default: we care about calibrated
        # per-class probabilities, not rebalanced classes.
    )
    cv_result = lgb.cv(params, lgb_data,
                       num_boost_round=int(num_boost_round),
                       folds=TimeSeriesSplit(5),
                       early_stopping_rounds=early_stopping_rounds,
                       categorical_feature=categorical_features.tolist(),
                       verbose_eval=False)
    return -cv_result['binary_logloss-mean'][-1]
# Search space for the three most influential LightGBM hyperparameters.
lgb_bo = BayesianOptimization(lgb_tuning, { "num_leaves": (20, 100),
                                            'min_data_in_leaf': (10, 1000),
                                            'learning_rate': (0.1, 0.5)},
                              random_state = 520)
%%time
# 10 random exploration points, then 50 guided iterations
lgb_bo.maximize(n_iter=50, init_points=10)
| iter | target | learni... | min_da... | num_le... | ------------------------------------------------------------- | 1 | -0.52 | 0.2768 | 215.6 | 21.17 | | 2 | -0.5199 | 0.2774 | 658.8 | 44.98 | | 3 | -0.5171 | 0.1238 | 380.1 | 36.39 | | 4 | -0.5178 | 0.1833 | 985.5 | 80.76 | | 5 | -0.5201 | 0.2803 | 64.34 | 23.65 | | 6 | -0.5161 | 0.1068 | 38.11 | 57.19 | | 7 | -0.5213 | 0.3346 | 599.5 | 37.25 | | 8 | -0.5198 | 0.2752 | 31.87 | 59.44 | | 9 | -0.518 | 0.2116 | 218.2 | 41.92 | | 10 | -0.5183 | 0.2118 | 959.6 | 52.3 | | 11 | -0.5225 | 0.4053 | 37.23 | 56.92 | | 12 | -0.523 | 0.456 | 99.0 | 24.6 | | 13 | -0.5217 | 0.3729 | 165.8 | 56.05 | | 14 | -0.5251 | 0.4177 | 11.93 | 87.6 | | 15 | -0.5189 | 0.2626 | 733.9 | 52.46 | | 16 | -0.5179 | 0.2242 | 838.5 | 58.77 | | 17 | -0.519 | 0.3054 | 528.5 | 33.51 | | 18 | -0.5162 | 0.1403 | 293.8 | 82.45 | | 19 | -0.5177 | 0.2145 | 787.8 | 62.0 | | 20 | -0.5178 | 0.2118 | 370.5 | 50.91 | | 21 | -0.5159 | 0.1261 | 884.7 | 87.9 | | 22 | -0.5231 | 0.4977 | 933.8 | 38.88 | | 23 | -0.5158 | 0.1078 | 817.2 | 65.98 | | 24 | -0.516 | 0.1098 | 508.4 | 57.69 | | 25 | -0.518 | 0.1978 | 847.1 | 24.18 | | 26 | -0.5205 | 0.3243 | 316.7 | 68.63 | | 27 | -0.5209 | 0.3931 | 812.9 | 66.42 | | 28 | -0.5235 | 0.4339 | 707.9 | 39.6 | | 29 | -0.5185 | 0.2791 | 213.9 | 27.48 | | 30 | -0.5209 | 0.3355 | 985.6 | 80.69 | | 31 | -0.5194 | 0.2994 | 625.6 | 45.61 | | 32 | -0.5177 | 0.1927 | 261.9 | 75.1 | | 33 | -0.5173 | 0.1832 | 439.0 | 57.5 | | 34 | -0.5179 | 0.2026 | 369.6 | 70.85 | | 35 | -0.5176 | 0.2673 | 724.0 | 31.41 | | 36 | -0.5222 | 0.3576 | 190.4 | 46.88 | | 37 | -0.524 | 0.4747 | 443.0 | 86.32 | | 38 | -0.5235 | 0.4821 | 531.1 | 77.43 | | 39 | -0.5225 | 0.4307 | 878.7 | 67.2 | | 40 | -0.5234 | 0.4607 | 956.9 | 56.25 | | 41 | -0.5185 | 0.2715 | 795.6 | 69.06 | | 42 | -0.5193 | 0.2686 | 808.1 | 96.79 | | 43 | -0.5167 | 0.145 | 525.0 | 97.21 | | 44 | -0.5162 | 0.1295 | 871.7 | 52.02 | | 45 | -0.525 | 0.4932 | 384.8 | 93.92 | | 46 | -0.5194 | 0.2646 
| 739.2 | 87.8 | | 47 | -0.5174 | 0.1056 | 62.97 | 84.52 | | 48 | -0.5212 | 0.3756 | 874.6 | 70.91 | | 49 | -0.5161 | 0.1553 | 402.2 | 97.44 | | 50 | -0.5196 | 0.3026 | 322.8 | 41.52 | | 51 | -0.5205 | 0.4364 | 763.0 | 61.7 | | 52 | -0.5159 | 0.1691 | 529.5 | 93.96 | | 53 | -0.5221 | 0.4102 | 902.6 | 47.74 | | 54 | -0.518 | 0.1698 | 328.4 | 92.45 | | 55 | -0.5204 | 0.2446 | 540.3 | 37.62 | | 56 | -0.5164 | 0.1105 | 327.1 | 25.55 | | 57 | -0.5214 | 0.3257 | 403.7 | 43.35 | | 58 | -0.5166 | 0.1258 | 687.5 | 23.23 | | 59 | -0.5226 | 0.4517 | 603.5 | 69.66 | | 60 | -0.517 | 0.1983 | 584.9 | 78.44 | ============================================================= CPU times: user 19min 21s, sys: 5min 19s, total: 24min 40s Wall time: 2min 9s
n = len(lgb_bo.res)
# Flatten the optimizer's trial log into a frame, one row per trial.
df_bo = pd.DataFrame({'target': [r['target'] for r in lgb_bo.res],
                      'learning_rate': [r['params']['learning_rate'] for r in lgb_bo.res],
                      'min_data_in_leaf': [r['params']['min_data_in_leaf'] for r in lgb_bo.res],
                      'num_leaves': [r['params']['num_leaves'] for r in lgb_bo.res]})
# Running best (minimum) cost after each call; cummin() replaces the
# original O(n^2) per-prefix np.min rebuild with a single O(n) pass.
df_bo['cum_min'] = (-df_bo['target']).cummin()
fig, ax = plt.subplots(figsize=(8, 6), dpi=120)
ax.plot(np.arange(n), df_bo['cum_min'], 'b.')
ax.plot(np.arange(n), df_bo['cum_min'])
ax.set_xlabel("Number of calls n")
ax.set_ylabel("minimum cost function after n calls")
ax.set_title("Convergence Plot");
# best trial found by the optimizer (target and parameters)
lgb_bo.max
{'target': -0.5158344981596148,
'params': {'learning_rate': 0.1078034154456137,
'min_data_in_leaf': 817.1510631103223,
'num_leaves': 65.984789473368}}
# keep the best hyperparameters for the second-stage search
lgb_best_param = lgb_bo.max['params']
Given the other parameters, pin down num_iterations (univariate search)
# With the other hyperparameters fixed, search num_boost_round alone
# (early stopping disabled so the requested round count is respected).
lgb_bo = BayesianOptimization(partial(lgb_tuning,**lgb_best_param,early_stopping_rounds=None), { "num_boost_round": (10, 100)},
                              random_state = 521)
%%time
lgb_bo.maximize(n_iter=40, init_points=10)
| iter | target | num_bo... | ------------------------------------- | 1 | -0.5162 | 73.56 | | 2 | -0.516 | 61.31 | | 3 | -0.5167 | 92.35 | | 4 | -0.5167 | 92.21 | | 5 | -0.5161 | 49.77 | | 6 | -0.5166 | 45.97 | | 7 | -0.5164 | 76.2 | | 8 | -0.5297 | 20.33 | | 9 | -0.5164 | 47.98 | | 10 | -0.5162 | 72.97 | | 11 | -0.517 | 100.0 | | 12 | -0.5158 | 55.83 | | 13 | -0.5176 | 38.61 | | 14 | -0.516 | 66.03 | | 15 | -0.5166 | 83.87 | | 16 | -0.5159 | 53.1 | | 17 | -0.5159 | 58.35 | | 18 | -0.516 | 63.89 | | 19 | -0.516 | 68.53 | | 20 | -0.5159 | 56.74 | | 21 | -0.5158 | 54.71 | | 22 | -0.5158 | 55.22 | | 23 | -0.5165 | 80.28 | | 24 | -0.5168 | 42.36 | | 25 | -0.516 | 59.74 | | 26 | -0.5168 | 87.58 | | 27 | -0.5158 | 54.21 | | 28 | -0.5158 | 54.44 | | 29 | -0.5158 | 54.44 | | 30 | -0.516 | 67.25 | | 31 | -0.5158 | 54.42 | | 32 | -0.5158 | 54.3 | | 33 | -0.5158 | 54.51 | | 34 | -0.5161 | 70.2 | | 35 | -0.5158 | 54.45 | | 36 | -0.5158 | 54.26 | | 37 | -0.5158 | 54.46 | | 38 | -0.5158 | 54.37 | | 39 | -0.5158 | 54.5 | | 40 | -0.5158 | 54.28 | | 41 | -0.5158 | 54.45 | | 42 | -0.5158 | 54.36 | | 43 | -0.5158 | 54.48 | | 44 | -0.5158 | 54.28 | | 45 | -0.5158 | 54.42 | | 46 | -0.5158 | 54.37 | | 47 | -0.5158 | 54.52 | | 48 | -0.5158 | 54.32 | | 49 | -0.5158 | 54.43 | | 50 | -0.5158 | 54.41 | ===================================== CPU times: user 19min 11s, sys: 10min 12s, total: 29min 23s Wall time: 2min 51s
n = len(lgb_bo.res)
# One row per trial of the num_boost_round search.
df_bo = pd.DataFrame({'target': [r['target'] for r in lgb_bo.res],
                      'num_boost_round': [r['params']['num_boost_round'] for r in lgb_bo.res]})
# Running minimum of the cost; cummin() replaces the original O(n^2)
# per-prefix np.min rebuild with a single O(n) pass.
df_bo['cum_min'] = (-df_bo['target']).cummin()
fig, ax = plt.subplots(figsize=(8, 6), dpi=120)
ax.plot(np.arange(n), df_bo['cum_min'], 'b.')
ax.plot(np.arange(n), df_bo['cum_min'])
ax.set_xlabel("Number of calls n")
ax.set_ylabel("minimum cost function after n calls")
ax.set_title("Convergence Plot");
# Best round count found; fold it into the final parameter dictionary.
lgb_bo.max
{'target': -0.5158344981596148,
'params': {'num_boost_round': 54.70840916891798}}
lgb_best_param.update(num_boost_round=int(lgb_bo.max['params']['num_boost_round']))
lgb_best_param
{'learning_rate': 0.1078034154456137,
'min_data_in_leaf': 817.1510631103223,
'num_leaves': 65.984789473368,
'num_boost_round': 54}
# Out-of-fold predictions for the logistic model under a time-ordered split.
df_cv = df_with_dummy.copy().reset_index()
df_cv['predicted_y'] = np.nan
df_cv['predicted_y_proba'] = np.nan
# NOTE(review): reset_index() adds an 'index' column; the original left it in
# the feature matrix, leaking the (time-ordered) row number into the model.
# Keep the column (it is needed for the later merge) but exclude it from fit.
non_features = ['index', 'y', 'predicted_y', 'predicted_y_proba']
for train_fold, cv_fold in TimeSeriesSplit(5).split(df_cv):
    df_train_fold = df_cv.iloc[train_fold, :]
    df_val_fold = df_cv.iloc[cv_fold, :]
    linear_qreg = LogisticRegression(C=ridge_reg.C_[0]).fit(
        df_train_fold.drop(non_features, axis=1), df_train_fold['y'])
    df_cv.loc[cv_fold, 'predicted_y'] = linear_qreg.predict(df_val_fold.drop(non_features, axis=1))
    df_cv.loc[cv_fold, 'predicted_y_proba'] = linear_qreg.predict_proba(df_val_fold.drop(non_features, axis=1))[:, 1]
def lgb_cv_oos_prediction(params, df, candidates, categorical_features, fobj=None, feval=None):
    """
    Make out-of-sample (out-of-fold) predictions for LightGBM models.

    Inputs:
        params: dictionary of parameters passed to lgb.train
        df: data frame (expected already sorted by time)
        candidates: list of relevant variables (label + features)
        categorical_features: array of categorical feature names
        fobj: callable; custom objective function (currently unused)
        feval: callable; custom evaluation metric (currently unused)
    Outputs:
        df_lb: a dataframe with out-of-fold 'predicted_y_proba' and
               'predicted_y' columns filled for the validation folds
    """
    df_lb = df[candidates].copy().reset_index()
    # NOTE(review): reset_index() adds an 'index' column; drop it from the
    # feature matrix so the time-ordered row number does not leak into the
    # model (the column itself is kept for merging predictions back later).
    non_features = ['index', 'y', 'actual_delivery_time', 'created_at', 'create_at_date']
    for train_fold, cv_fold in TimeSeriesSplit(5).split(df_lb):
        fold_X_train = df_lb.drop(non_features, axis=1).iloc[train_fold, :]
        fold_y_train = df_lb.loc[train_fold, 'y']
        fold_X_val = df_lb.drop(non_features, axis=1).iloc[cv_fold, :]
        fold_y_val = df_lb.loc[cv_fold, 'y']
        lgb_train = lgb.Dataset(fold_X_train, fold_y_train)
        lgb_cv = lgb.Dataset(fold_X_val, fold_y_val)
        lbm_model = lgb.train(params, lgb_train,
                              # fobj=fobj,
                              # feval=feval,
                              valid_sets=lgb_cv,
                              categorical_feature=categorical_features.tolist(),
                              verbose_eval=False)
        df_lb.loc[cv_fold, 'predicted_y_proba'] = lbm_model.predict(fold_X_val)
        # hard label at the default 0.5 probability cut-off
        df_lb.loc[cv_fold, 'predicted_y'] = 1 * (df_lb.loc[cv_fold, 'predicted_y_proba'] >= 0.5)
    return df_lb
# Final ("gold") parameter set assembled from the tuned values.
params_gold = {
    'num_leaves': int(lgb_best_param['num_leaves']),
    'learning_rate': lgb_best_param['learning_rate'],
    "min_data_in_leaf": int(lgb_best_param['min_data_in_leaf']),
    "num_boost_round":int(lgb_best_param['num_boost_round']),
    'early_stopping_rounds':None,
    'verbose': -1,
    'saved_feature_importance_type':1,
    'metric': "binary_logloss",
    'objective':'binary'
}
# out-of-fold predictions with the tuned model, in time order
df_gold = lgb_cv_oos_prediction(params_gold,df.sort_values('created_at'),candidates,categorical_features)
Just based on binary outcome, not probability
def cm_analysis(y_true, y_pred, labels, ax,ymap=None):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    args:
      y_true: true label of the data, with shape (nsamples,)
      y_pred: prediction of the data, with shape (nsamples,)
      labels: string array, name the order of class labels in the confusion matrix.
               use `clf.classes_` if using scikit-learn models.
               with shape (nclass,).
      ax: matplotlib axes to draw the annotated heatmap on
      ymap: dict: any -> string, length == nclass.
               if not None, map the labels & ys to more understandable strings.
               Caution: original y_true, y_pred and labels must align.
    returns:
      ax with the annotated confusion-matrix heatmap drawn on it
    """
    if ymap is not None:
        # re-label predictions, truths and label order before computing the matrix
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        labels = [ymap[yi] for yi in labels]
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    accuracy = np.trace(cm) / np.sum(cm).astype('float')
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    # row-normalized percentages: share of each TRUE class per predicted cell
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # diagonal cells also show count/row-total
                s = cm_sum[i]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                # keep zero cells blank for readability
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted \naccuracy={:0.4f}'.format(accuracy)
    sns.heatmap(cm, annot=annot, fmt='', ax=ax,
                square = True,cmap=plt.cm.Blues,
                cbar_kws={'shrink':0.8},annot_kws={"fontsize":12}
                )
    return ax
# Side-by-side confusion matrices for the two out-of-fold prediction sets.
fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=120)
for ax, df_plot, name in zip(axes, [df_cv, df_gold], ['Logistic', 'LGB']):
    valid = df_plot['y'].notna() & df_plot['predicted_y'].notna()
    ax = cm_analysis(df_plot.loc[valid, 'y'], df_plot.loc[valid, 'predicted_y'],
                     [0, 1], ax, ymap=None)
    ax.set_title(name)
plt.tight_layout()
def binary_plotting(y_true, y_scores, name=None, ax=None):
    """
    Plot the precision-recall curve (left) and ROC curve (right).
    args:
        y_true: true label of the data, with shape (nsamples,)
        y_scores: predicted probability of the positive class
        name: name of the classifier, used in the legends
        ax: sequence of 2 axes; a new figure is created if None
    returns the axes.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 2, figsize=(12, 6), dpi=120)
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(recall, precision)
    ap = average_precision_score(y_true, y_scores, average='macro')
    disp = PrecisionRecallDisplay(precision=precision, recall=recall)
    # NOTE(review): the original label had an unbalanced '(' and computed
    # `ap` without ever showing it; report both PR-AUC and AP.
    disp = disp.plot(ax[0], name=f'{name} (PR-AUC = {pr_auc:0.2f}, AP = {ap:0.2f})')
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    ax[1].plot(fpr, tpr, lw=2, label=f'{name} (area = {roc_auc:0.2f})')
    ax[1].plot([0, 1], [0, 1], 'k--')  # chance diagonal
    ax[1].set_xlim([0.0, 1.0])
    ax[1].set_ylim([0.0, 1.05])
    ax[1].set_xlabel('False Positive Rate')
    ax[1].set_ylabel('True Positive Rate (Recall)')
    ax[1].set_title('Receiver operating characteristic Curve')
    ax[1].legend(loc="lower right")
    plt.subplots_adjust(wspace=0, hspace=0)
    plt.tight_layout()
    return ax
# Overlay PR and ROC curves for both models on shared axes.
fig, ax = plt.subplots(1, 2, figsize=(12, 6), dpi=120)
for df_plot, name in zip([df_cv, df_gold], ['Logistic', 'LGB']):
    valid = df_plot['y'].notna() & df_plot['predicted_y_proba'].notna()
    binary_plotting(df_plot.loc[valid, 'y'], df_plot.loc[valid, 'predicted_y_proba'],
                    name=name, ax=ax);
Lastly, I aggregate predicted probability.
# Merge both models' out-of-fold predictions back onto the main frame.
# 'index' in df_cv/df_gold holds the original row label (from reset_index),
# so the first merge joins df's index against it; suffixes=[False,False]
# forces an error on any unexpected column collision.
df_eval=df.merge(df_cv[['index','predicted_y','predicted_y_proba']].rename(columns={'predicted_y':'pred_y_Ridge',
                                                                                    'predicted_y_proba':'predicted_y_proba_Ridge'}),
                 left_index=True,right_on='index',how='inner',validate="1:1",suffixes=[False,False])\
    .merge(df_gold[['index','predicted_y','predicted_y_proba']].rename(columns={'predicted_y':'pred_y_lgb_gold',
                                                                                'predicted_y_proba':'predicted_y_lgb_gold'}),
           how='inner',validate="1:1",suffixes=[False,False])\
    .sort_values('created_at')
# Daily totals: actual positives vs the sum of predicted probabilities
# (expected number of positives) for each model.
ts = df_eval[['create_at_date','y','predicted_y_proba_Ridge','predicted_y_lgb_gold']]\
    .dropna().groupby('create_at_date').sum()
fig,ax=plt.subplots(figsize=(10,6),dpi=120)
ax.plot(ts['y'],label='Actual y')
ax.plot(ts['predicted_y_proba_Ridge'],label='Pred:Logistics',ls = 'dotted',color = 'red',alpha=0.7)
ax.plot(ts['predicted_y_lgb_gold'],label='Pred:LGB',ls = '--',color = 'green',alpha=0.8)
ax.legend()
ax.set_title('Total Y=1 Per Day');
def find_threshold(y_true, y_scores, roc=True, name=None):
    """Plot a ROC or precision-recall curve and return the 'best' threshold.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_scores : array-like of predicted probabilities for the positive class.
    roc : bool, default True
        If True, choose the threshold maximizing Youden's J statistic
        (tpr - fpr) on the ROC curve; otherwise choose the threshold
        maximizing the F1 score on the precision-recall curve.
    name : str, optional
        Model name shown in the plot legend.

    Returns
    -------
    float
        The selected probability threshold.
    """
    fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
    if roc:
        # calculate roc curves
        fpr, tpr, thresholds = roc_curve(y_true, y_scores)
        # plot the roc curve for the model
        ax.plot([0, 1], [0, 1], linestyle='--')
        ax.plot(fpr, tpr, marker='.', label=name + f' AUC = ({auc(fpr,tpr):.4f})')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        # best threshold = maximum of Youden's J statistic
        J = tpr - fpr
        ix = np.argmax(J)
        best_thresh = thresholds[ix]
        print('Best Threshold=%f, J-Stats=%.4f' % (best_thresh, J[ix]))
        ax.scatter(fpr[ix], tpr[ix], marker='o', color='black',
                   s=100, label=f'Best Threshold = {best_thresh:.4f}')
        ax.plot([fpr[ix], fpr[ix]], [0., tpr[ix]], "y:")
        ax.plot([0, fpr[ix]], [tpr[ix], tpr[ix]], "y:")
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.legend()
    else:
        # calculate pr-curve
        precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
        average_precision = average_precision_score(y_true, y_scores)
        # F1 per threshold; guard the 0/0 case where precision == recall == 0
        denom = precision + recall
        fscore = np.where(denom > 0,
                          (2 * precision * recall) / np.where(denom > 0, denom, 1),
                          0.0)
        # precision_recall_curve returns one fewer threshold than points (the
        # final precision=1/recall=0 point has no threshold), so exclude the
        # last point from the argmax to avoid indexing past `thresholds`.
        ix = np.argmax(fscore[:-1])
        best_thresh = thresholds[ix]
        print('Best Threshold=%f, F-Score=%.4f' % (best_thresh, fscore[ix]))
        # no-skill baseline is the positive-class prevalence.
        # Bug fix: this previously read the *global* `y` instead of `y_true`.
        y_arr = np.asarray(y_true)
        no_skill = (y_arr == 1).sum() / len(y_arr)
        ax.plot([0, 1], [no_skill, no_skill], linestyle='--')
        ax.plot(recall, precision, marker='.', label=f'{name} AP = ({average_precision:.4f})')
        ax.plot([recall[ix], recall[ix]], [0., precision[ix]], "y:")
        ax.plot([0, recall[ix]], [precision[ix], precision[ix]], "y:")
        ax.scatter(recall[ix], precision[ix], marker='o',
                   s=100, color='black', label=f'Best Threshold = {best_thresh:.4f}')
        # axis labels
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.legend()
    # show the plot
    return best_thresh
# Pick operating thresholds for the LGB model using both criteria:
# Youden's J on the ROC curve and maximum F1 on the PR curve.
mask = df_gold['y'].notna() & df_gold['predicted_y_proba'].notna()
y_true = df_gold.loc[mask, 'y']
y_scores = df_gold.loc[mask, 'predicted_y_proba']
find_threshold(y_true, y_scores, roc=True, name='LGB');
find_threshold(y_true, y_scores, roc=False, name='LGB');
Best Threshold=0.592635, J-Stats=0.4328 Best Threshold=0.379901, F-Score=0.7985
(0.592635+0.379901)/2
0.48626800000000003
params_gold
{'num_leaves': 65,
'learning_rate': 0.1078034154456137,
'min_data_in_leaf': 817,
'num_boost_round': 54,
'early_stopping_rounds': None,
'verbose': -1,
'saved_feature_importance_type': 1,
'metric': 'binary_logloss',
'objective': 'binary'}
# Retrain on the full dataset with the tuned hyperparameters, then score
# every row and apply a 0.5 probability cutoff for the hard predictions.
lgb_final = lgb.train(
    params_gold,
    lgb_data,
    categorical_feature=categorical_features.tolist(),
    verbose_eval=10,
)
df['y_scores'] = lgb_final.predict(X)
df['y_pred'] = df['y_scores'] >= 0.5
y_true, y_pred, y_scores = df['y'], df['y_pred'], df['y_scores']
# Evaluate the final full-data model.
# Fix: set `name` explicitly — the original relied on the leftover loop
# variable from the earlier model-comparison loop, which was 'LGB' only by
# accident of iteration order.
name = 'LGB'
fig, ax = plt.subplots(1, 2, figsize=(12, 6), dpi=120)
binary_plotting(y_true, y_scores, name=name, ax=ax);
# Daily totals: actual positives vs. summed predicted probabilities.
ts = df[['create_at_date', 'y', 'y_scores']].dropna().groupby('create_at_date').sum()
fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
ax.plot(ts['y'], label='Actual y')
ax.plot(ts['y_scores'], label='Pred:LGB', ls='--', color='green', alpha=0.8)
ax.legend()
ax.set_title('Total Y=1 Per Day');
Calibration curve to check how closely the predicted probabilities match the observed positive-class frequencies.
# Reliability diagram: observed positive fraction vs. mean predicted
# probability per bin, with a histogram of the score distribution below.
# The legend shows the Brier score (lower is better calibrated).
name = 'LGB'
fig = plt.figure(1, figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
bscore = brier_score_loss(y_true, y_scores)
frac_pos, mean_pred = calibration_curve(y_true, y_scores, n_bins=10)
ax1.plot(mean_pred, frac_pos, "s-", label="%s (%1.3f)" % (name, bscore))
ax2.hist(y_scores, range=(0, 1), bins=10, label=name,
         histtype="step", lw=2)
ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots  (reliability curve)')
ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)
plt.tight_layout()
# Feature importance from the final booster.
# Fix: request gain-based importance explicitly. `feature_importance()`
# defaults to importance_type='split' (split counts), which contradicted the
# y-axis label "Total gains of splits" and the gain intent signalled by
# `saved_feature_importance_type: 1` in params_gold.
df_feature_importance = (
    pd.DataFrame({
        'feature': lgb_final.feature_name(),
        'importance': lgb_final.feature_importance(importance_type='gain'),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
fig, ax = plt.subplots(figsize=(6, 6), dpi=120)
df_feature_importance.plot.bar(ax=ax)
ax.set_title("Feature importances from LightGBM")
ax.set_ylabel("Total gains of splits using each feature")
# ax.set_yscale('log')
fig.tight_layout()
# Explain the final booster with SHAP values.
explainer = shap.TreeExplainer(lgb_final)
shap_values = explainer.shap_values(X)
# summarize the effects of all the features
# NOTE(review): `shap_values[1]` assumes the older shap API that returns a
# per-class list for binary LightGBM models (index 1 = positive class) —
# confirm against the installed shap version.
shap.summary_plot(shap_values[1], X)
We can also see the partial dependence (including the direction of each feature's effect) via SHAP dependence plots here:
# Trim extreme feature values so the dependence plots are readable;
# cutoffs appear hand-picked from the feature distributions — TODO confirm.
mask = (X["total_order_per_onshift_dasher"]<7.5)&(X['total_items']<50)\
&(X['subtotal']<15000)&(X['min_item_price']<7000)&(X['estimated_order_place_duration']<1500)\
&(X['store_id_encoded']<10000)
# One SHAP dependence plot per feature, using positive-class SHAP values on
# the masked rows only.
for name in X.columns.values:#['total_order_per_onshift_dasher','estimated_store_to_consumer_driving_duration','subtotal']:
shap.dependence_plot(name, shap_values[1][mask], X[mask],
display_features=X[mask])